In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import metrics
In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
Goal: evaluate with relevant metrics, apply (repeated/nested) cross-validation where appropriate, tune hyperparameters, and clearly compare and interpret the models.
Loading data¶
In [4]:
# Load the IBM Telco customer-churn dataset (one row per customer).
df = pd.read_csv('./dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Set once, apply to all: white background for every plotly figure below.
pio.templates.default = "plotly_white"
Data description¶
In [5]:
df.head()
Out[5]:
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [6]:
df.info()
df.shape
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null object dtypes: float64(1), int64(2), object(18) memory usage: 1.1+ MB
Out[6]:
(7043, 21)
In [7]:
df.dtypes
Out[7]:
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
In [8]:
df.columns.values
Out[8]:
array(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
'TotalCharges', 'Churn'], dtype=object)
In [9]:
df = df.drop(['customerID'], axis=1)
df.head()
Out[9]:
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
Missing value¶
In [10]:
# TotalCharges is read as object dtype because some entries are not numeric;
# coerce to float, turning the unparseable entries into NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors="coerce")
# Count missing values per column (only TotalCharges has any — see output below).
df.isna().sum()
Out[10]:
gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 11 Churn 0 dtype: int64
In [11]:
# Correlation of missingness (1 = missing, 0 = present)
# With only one column containing NaNs, this mostly sanity-checks that
# missingness is not shared across columns.
missing_corr = df.isna().corr()
sns.heatmap(missing_corr, annot=True, cmap='coolwarm')
plt.title('Correlation of Missingness')
plt.show()
In [12]:
# Inspect the rows where TotalCharges failed numeric coercion.
# Use the pandas-native isna() rather than np.isnan — clearer, and also
# safe for nullable/extension dtypes where np.isnan would raise.
df[df['TotalCharges'].isna()]
Out[12]:
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | No | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | NaN | No |
| 753 | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | NaN | No |
| 936 | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | Yes | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | NaN | No |
| 1082 | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | NaN | No |
| 1340 | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | Yes | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | NaN | No |
| 3331 | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | NaN | No |
| 3826 | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | NaN | No |
| 4380 | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | NaN | No |
| 5218 | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | NaN | No |
| 6670 | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | Yes | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | NaN | No |
| 6754 | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | Yes | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | NaN | No |
TotalCharges is missing (NaN) exactly for the rows where tenure is 0 — these are brand-new customers who have not yet been billed.
In [13]:
# Remove the 11 brand-new customers (tenure == 0) whose TotalCharges is NaN.
# Boolean filtering with reassignment is idempotent on re-run, unlike
# drop(..., inplace=True), and produces the same resulting frame.
df = df[df['tenure'] != 0]
Impute any remaining missing TotalCharges with the column mean (after dropping the tenure == 0 rows above no missing values should remain, so this is a safeguard).
In [14]:
# Impute missing TotalCharges with the column mean.
# Scope the fill to the one column: the original df.fillna(scalar) would
# have filled NaNs in *every* column with the TotalCharges mean — harmless
# only by accident, since TotalCharges is the sole column with NaNs here.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
In [15]:
df.isna().sum()
Out[15]:
gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
In [16]:
df["SeniorCitizen"] = df["SeniorCitizen"].map({0: "No", 1: "Yes"})
In [17]:
df["InternetService"].describe()
Out[17]:
count 7032 unique 3 top Fiber optic freq 3096 Name: InternetService, dtype: object
In [18]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()
Out[18]:
| tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|
| count | 7032.000000 | 7032.000000 | 7032.000000 |
| mean | 32.421786 | 64.798208 | 2283.300441 |
| std | 24.545260 | 30.085974 | 2266.771362 |
| min | 1.000000 | 18.250000 | 18.800000 |
| 25% | 9.000000 | 35.587500 | 401.450000 |
| 50% | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 55.000000 | 89.862500 | 3794.737500 |
| max | 72.000000 | 118.750000 | 8684.800000 |
Visualization¶
Gender and churn distribution¶
In [19]:
# Side-by-side donut charts: overall gender split vs overall churn split.
pie_gender = px.pie(df, names='gender', title='Gender')
pie_churn = px.pie(df, names='Churn', title='Churn')

# 'domain'-type subplot cells are required to host pie traces.
fig = make_subplots(rows=1, cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}]])
fig.add_trace(pie_gender.data[0], row=1, col=1)
fig.add_trace(pie_churn.data[0], row=1, col=2)

# Donut styling shared by both traces.
fig.update_traces(hole=0.4, hoverinfo="label+percent+name", textfont_size=16)

# Centered title plus a label inside each donut hole.
fig.update_layout(
    title_text="Gender and Churn Distributions",
    title_x=0.5,
    annotations=[
        dict(text='Gender', x=0.18, y=0.5, font_size=20, showarrow=False),
        dict(text='Churn', x=0.82, y=0.5, font_size=20, showarrow=False),
    ],
    font=dict(size=14),
)
fig.show()
Churn rate by gender¶
In [20]:
df[df["Churn"] == "No"]["gender"].value_counts()
Out[20]:
gender Male 2619 Female 2544 Name: count, dtype: int64
In [21]:
# Percentage of each gender within the churn / non-churn groups.
gender_share = pd.crosstab(df['Churn'], df['gender'], normalize='index').mul(100)

axis = gender_share.plot(kind='bar', stacked=True,
                         color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
axis.set_ylabel('Percentage (%)')
axis.set_title('Churn Rate by Gender')

# Annotate each stacked segment with its share.
for bars in axis.containers:
    axis.bar_label(bars, fmt='%.1f%%', label_type='center')

plt.legend(title='Gender')
plt.show()
In [22]:
churn_gender_counts = pd.crosstab(df['Churn'], df['gender'])
ax = churn_gender_counts.plot(kind='bar', color=['#c2c2f0', '#ffb3e6'], figsize=(8, 6))
ax.set_ylabel('Count')
ax.set_title('Number of Customers by Churn and Gender')
for container in ax.containers:
ax.bar_label(container)
plt.legend(title='Gender')
plt.show()
Contract distribution¶
In [23]:
# Grouped bars: contract type split by churn status.
# Fixed the unclosed HTML bold tag in the title ("<b>...<b>" -> "<b>...</b>").
fig = px.histogram(df, x="Churn", color="Contract", barmode="group",
                   title="<b>Customer contract distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Payment Method Distribution¶
In [24]:
# Donut chart of payment methods.
# Bug fix: unique() and value_counts() do not return categories in the
# same order, so labels could be paired with the wrong counts. Derive
# labels and values from the same value_counts() result to keep them aligned.
payment_counts = df['PaymentMethod'].value_counts()
fig = go.Figure(data=[go.Pie(labels=payment_counts.index,
                             values=payment_counts.values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()
Payment for Churn¶
In [25]:
fig = px.histogram(df, x="Churn", color="PaymentMethod", title="Customer Payment Method distribution w.r.t. Churn")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Internet Service and Gender of Churn¶
In [26]:
df["InternetService"].unique()
Out[26]:
array(['DSL', 'Fiber optic', 'No'], dtype=object)
In [27]:
df[df["gender"] == "Male"][["InternetService", "Churn"]].value_counts()
Out[27]:
InternetService Churn DSL No 992 Fiber optic No 910 No No 717 Fiber optic Yes 633 DSL Yes 240 No Yes 57 Name: count, dtype: int64
In [28]:
# Grouped + stacked bars of churn by internet service and gender.
# NOTE(review): the counts below are hardcoded; the Male figures match the
# value_counts() output shown above, but the Female figures should be
# re-verified against the data if df changes upstream.
fig = go.Figure()
# Define categories
churn_labels = ['Churn:No', 'Churn:Yes']
genders = ['Female', 'Male']
# Data: [ [DSL_F, DSL_M], [Fiber_F, Fiber_M], [NoInternet_F, NoInternet_M] ] per churn group
data = {
    'DSL': {
        'Churn:No': [965, 992],
        'Churn:Yes': [219, 240]
    },
    'Fiber optic': {
        'Churn:No': [889, 910],
        'Churn:Yes': [664, 633]
    },
    'No Internet': {
        'Churn:No': [690, 717],
        'Churn:Yes': [56, 57]
    }
}
# Build x-axis labels: "Churn:No-Female", "Churn:No-Male".
x_labels = [f"{churn}-{gender}" for churn in churn_labels for gender in genders]
# Add a trace for each InternetService (stacked)
for service, churn_data in data.items():
    y_values = []
    for churn in churn_labels:
        y_values.extend(churn_data[churn])  # [F, M] for this churn group
    fig.add_trace(go.Bar(
        x=x_labels,
        y=y_values,
        name=service,
        text=y_values,
        textposition='auto'
    ))
fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")
fig.show()
Dependents churn distribution¶
In [29]:
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
fig = px.histogram(df, x="Churn", color="Dependents", barmode="group", title="<b>Dependents distribution</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Partner Churn¶
In [30]:
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="Partner", barmode="group", title="<b>Chrun distribution w.r.t. Partners</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
SeniorCitizen distribution¶
In [31]:
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="SeniorCitizen", title="<b>Chrun distribution w.r.t. Senior Citizen</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
Online security churn¶
In [32]:
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
fig = px.histogram(df, x="Churn", color="OnlineSecurity", barmode="group", title="<b>Churn w.r.t Online Security</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
paperless billing¶
In [33]:
color_map = {"Yes": '#FFA15A', "No": '#00CC96'}
fig = px.histogram(df, x="Churn", color="PaperlessBilling", title="<b>Chrun distribution w.r.t. Paperless Billing</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
TechSupport distribution¶
In [34]:
# Typo fix in the displayed title: "Chrun" -> "Churn".
fig = px.histogram(df, x="Churn", color="TechSupport", barmode="group",
                   title="<b>Churn distribution w.r.t. TechSupport</b>", text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
In [35]:
color_map = {"Yes": '#00CC96', "No": '#B6E880'}
fig = px.histogram(df, x="Churn", color="PhoneService", title="<b>Chrun distribution w.r.t. Phone Service</b>",
color_discrete_map=color_map, text_auto=True)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()
In [36]:
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'No')],
color="Red", fill=True)
ax = sns.kdeplot(df.MonthlyCharges[(df["Churn"] == 'Yes')],
ax=ax, color="Blue", fill=True)
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn')
Out[36]:
Text(0.5, 1.0, 'Distribution of monthly charges by churn')
In [37]:
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'No')],
color="Red", fill=True)
ax = sns.kdeplot(df.TotalCharges[(df["Churn"] == 'Yes')],
ax=ax, color="Blue", fill=True)
ax.legend(["Not Churn", "Churn"], loc='upper right')
ax.set_ylabel('Density')
ax.set_xlabel('Monthly Charges')
ax.set_title('Distribution of monthly charges by churn')
Out[37]:
Text(0.5, 1.0, 'Distribution of monthly charges by churn')
In [38]:
# Tenure distribution by churn status.
fig = px.box(df, x='Churn', y='tenure')
# Update yaxis properties (row/col arguments are no-ops here since this is
# a single-axes figure, not a subplot grid)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)
# Update xaxis properties
fig.update_xaxes(title_text='Churn', row=1, col=1)
# Update size and title
fig.update_layout(autosize=True, width=750, height=600,
                  title_font=dict(size=25, family='Courier'),
                  title='Tenure vs Churn',
                  )
fig.show()
In [39]:
# Lower-triangle heatmap of pairwise correlations after integer-encoding
# every column with pd.factorize.
# NOTE(review): factorize assigns codes in order of first appearance, so
# these correlations depend on row order for nominal columns — treat this
# as a rough screen, not exact statistics.
plt.figure(figsize=(25, 10))
corr = df.apply(lambda x: pd.factorize(x)[0]).corr()
# Mask the upper triangle (it mirrors the lower one).
mask = np.triu(np.ones_like(corr, dtype=bool))
ax = sns.heatmap(corr, mask=mask, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, linewidths=.2,
                 cmap='coolwarm', vmin=-1, vmax=1)
Data preprocessing¶
In [40]:
df.nunique()
Out[40]:
gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 72 PhoneService 2 MultipleLines 3 InternetService 3 OnlineSecurity 3 OnlineBackup 3 DeviceProtection 3 TechSupport 3 StreamingTV 3 StreamingMovies 3 Contract 3 PaperlessBilling 2 PaymentMethod 4 MonthlyCharges 1584 TotalCharges 6530 Churn 2 dtype: int64
In [41]:
# List the category labels for every low-cardinality (< 5 levels) column.
for col in df.columns:
    if df[col].nunique(dropna=True) < 5:
        print(f"{col}: {list(df[col].dropna().unique())}")
gender: ['Female', 'Male'] SeniorCitizen: ['No', 'Yes'] Partner: ['Yes', 'No'] Dependents: ['No', 'Yes'] PhoneService: ['No', 'Yes'] MultipleLines: ['No phone service', 'No', 'Yes'] InternetService: ['DSL', 'Fiber optic', 'No'] OnlineSecurity: ['No', 'Yes', 'No internet service'] OnlineBackup: ['Yes', 'No', 'No internet service'] DeviceProtection: ['No', 'Yes', 'No internet service'] TechSupport: ['No', 'Yes', 'No internet service'] StreamingTV: ['No', 'Yes', 'No internet service'] StreamingMovies: ['No', 'Yes', 'No internet service'] Contract: ['Month-to-month', 'One year', 'Two year'] PaperlessBilling: ['Yes', 'No'] PaymentMethod: ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'] Churn: ['No', 'Yes']
Encoding¶
In [42]:
# Encode categoricals for modelling.
binary_cols = ['Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn', 'SeniorCitizen']
# Yes/No columns -> 1/0
for col in binary_cols:
    df[col] = df[col].map({'No': 0, 'Yes': 1})
# gender: Female -> 0, Male -> 1
df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})
# Multi-level columns get one-hot encoding; drop_first=False keeps every
# level as its own indicator column (easier to read in the correlation
# chart below, at the cost of redundant columns).
categorical_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod'
]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)
In [43]:
# Correlation of every encoded feature with the Churn target, as a
# diverging bar chart sorted from most positive to most negative.
churn_corr = df.corr()['Churn'].sort_values(ascending=False)
churn_corr = churn_corr.drop('Churn')  # exclude the target's self-correlation
plt.figure(figsize=(16, 9))
# Use a diverging colormap: red (positive), white (0), blue (negative)
colors = plt.cm.RdYlBu_r((churn_corr + 1) / 2)  # Normalize to [0,1] for colormap
bars = plt.bar(churn_corr.index, churn_corr.values, color=colors, edgecolor='black', linewidth=0.5)
plt.xticks(rotation=60, ha='right', fontsize=11)
plt.yticks(fontsize=11)
plt.ylabel('Correlation with Churn', fontsize=13)
plt.title('Feature Correlation with Churn (Higher = More Likely to Churn)', fontsize=16, weight='bold')
plt.axhline(0, color='gray', linewidth=0.8)
# Label only the bars with non-trivial correlation (|r| > 0.1).
for bar, corr in zip(bars, churn_corr.values):
    if abs(corr) > 0.1:
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + (0.01 if bar.get_height() >= 0 else -0.02),
            f'{corr:.2f}',
            ha='center', va='bottom' if bar.get_height() >= 0 else 'top',
            fontsize=9, fontweight='bold'
        )
plt.tight_layout()
plt.show()
Split train/test set¶
In [44]:
# Separate target vector and feature matrix.
y = df['Churn'].values
X = df.drop(columns=['Churn'])
In [45]:
# 70/30 split, stratified on the target to preserve the churn ratio in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40, stratify=y)
In [46]:
def distplot(feature, frame, color='r'):
    """Draw a histogram of `frame[feature]` in its own small figure."""
    plt.figure(figsize=(8, 3))
    plt.title(f"Distribution for {feature}")
    sns.histplot(frame[feature], color=color)
In [47]:
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols: distplot(feat, df)
In [48]:
# Preview what standardization does to the numeric features (fit on the
# full frame for visualization only; the model pipeline below fits the
# scaler on the training split).
df_std = pd.DataFrame(StandardScaler().fit_transform(df[num_cols].astype('float64')), columns=num_cols)
# Consistency fix: iterate over num_cols (defined in this section) instead
# of the earlier, identically-valued numerical_cols.
for feat in num_cols:
    distplot(feat, df_std, color='c')
In [49]:
# Standardize the numeric features: fit on the training split only, then
# apply the same transform to the test split (avoids train/test leakage).
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
Model¶
In [50]:
def predict_result(model, model_name, X_test, y_test):
    """Report test-set performance for a fitted hyper-parameter search.

    NOTE(review): expects a fitted search object (e.g. GridSearchCV)
    exposing best_estimator_ — the plain estimators fitted in the cells
    below do NOT have this attribute; confirm the caller before use.

    Returns (best_estimator, predicted_labels).
    """
    # Get best estimator
    best = model.best_estimator_
    # Predictions
    predicted_y = best.predict(X_test)
    y_pred_prob = best.predict_proba(X_test)[:, 1]  # probability of the positive class
    # Final metrics
    accuracy = best.score(X_test, y_test)
    auc = roc_auc_score(y_test, y_pred_prob)
    print(f"\nFinal Test Performance {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predicted_y))
    return best, predicted_y
def show_confusion_matrix(test, predicted_y, model_name):
    """Render one model's confusion matrix as an annotated heatmap."""
    cm = confusion_matrix(test, predicted_y)
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm,
                annot=True,
                fmt="d",
                linecolor="k",
                linewidths=3,
                cmap="Blues")
    plt.title(f"{model_name.upper()} CONFUSION MATRIX", fontsize=14)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
def show_ROC(model, y_test, model_name, X=None):
    """Plot the ROC curve of a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted classifier exposing predict_proba.
    y_test : true binary labels for the scored rows.
    model_name : label used in the legend and title.
    X : feature matrix to score. Defaults to the notebook-global X_test
        for backward compatibility — the original implementation read the
        global implicitly, which silently broke if called with any other
        y_test.
    """
    if X is None:
        X = X_test  # fall back to the global test set, as before
    y_pred_prob = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    plt.figure(figsize=(6, 5))
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    # use roc_auc_score for the legend annotation
    auc_test = roc_auc_score(y_test, y_pred_prob)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_test:.3f})', color="r", linewidth=2)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name.upper()} ROC CURVE', fontsize=16)
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()
def importance_feature(best, model_name):
    """Print and plot the top-10 feature importances of a fitted tree model.

    NOTE(review): reads the notebook-global X_train for column names, and
    assumes `best` exposes feature_importances_ (tree ensembles do;
    linear/KNN/SVM models do not).
    """
    importances = best.feature_importances_
    importance = pd.DataFrame({
        'feature': X_train.columns.tolist(),
        'importance': importances
    }).sort_values(by='importance', ascending=False)
    print(importance.head(10))
    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance.head(10), x='importance', y='feature')
    plt.title(f'{model_name} Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
Knn¶
In [51]:
# Baseline k-nearest-neighbours classifier with k = 11.
knn_model = KNeighborsClassifier(n_neighbors=11)
knn_model.fit(X_train, y_train)

predict_knn_y = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test, y_test)

print("KNN accuracy:", accuracy_knn)
print(classification_report(y_test, predict_knn_y))
KNN accuracy: 0.7767772511848341
precision recall f1-score support
0 0.84 0.86 0.85 1549
1 0.58 0.55 0.57 561
accuracy 0.78 2110
macro avg 0.71 0.71 0.71 2110
weighted avg 0.77 0.78 0.77 2110
In [52]:
show_confusion_matrix(y_test, predict_knn_y, "KNN")
show_ROC(knn_model, y_test, "KNN")
Svm¶
In [ ]:
# Support-vector classifier; probability=True enables predict_proba
# (needed for the ROC curves below) at extra training cost.
svc_model = SVC(probability=True, random_state=1)
svc_model.fit(X_train, y_train)
# Make predictions
predict_svc_y = svc_model.predict(X_test)
# Evaluate accuracy
accuracy_svc = svc_model.score(X_test, y_test)
print("SVM accuracy is:", accuracy_svc)
print(classification_report(y_test, predict_svc_y))
In [122]:
show_confusion_matrix(y_test, predict_svc_y, "svm")
show_ROC(svc_model, y_test, "svm")
Random forest¶
In [123]:
# Random forest with capped leaf count (max_leaf_nodes=30) to limit
# overfitting; oob_score=True gives a free out-of-bag validation estimate.
rt_model = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=-1,
                                  random_state=50, max_features="sqrt",
                                  max_leaf_nodes=30)
rt_model.fit(X_train, y_train)
# Make predictions
prediction_rf_y = rt_model.predict(X_test)
# Print performance metrics
print(metrics.accuracy_score(y_test, prediction_rf_y))
print(classification_report(y_test, prediction_rf_y))
0.8085308056872038
precision recall f1-score support
0 0.84 0.92 0.88 1549
1 0.69 0.50 0.58 561
accuracy 0.81 2110
macro avg 0.76 0.71 0.73 2110
weighted avg 0.80 0.81 0.80 2110
In [124]:
show_confusion_matrix(y_test, prediction_rf_y, "random forest")
show_ROC(rt_model, y_test, "random forest")
logistic regression¶
In [125]:
# Logistic regression baseline; max_iter raised to 1000 so the solver
# converges on the one-hot encoded feature set.
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
prediction_lr_y = lr_model.predict(X_test)
accuracy_lr = lr_model.score(X_test, y_test)
print("Logistic Regression accuracy is :", accuracy_lr)
report = classification_report(y_test, prediction_lr_y)
print(report)
Logistic Regression accuracy is : 0.8113744075829384
precision recall f1-score support
0 0.85 0.90 0.87 1549
1 0.67 0.58 0.62 561
accuracy 0.81 2110
macro avg 0.76 0.74 0.75 2110
weighted avg 0.80 0.81 0.81 2110
In [126]:
show_confusion_matrix(y_test, prediction_lr_y, "logistic regression")
show_ROC(lr_model, y_test, "logistic regression")
Decision tree¶
In [127]:
# Unpruned decision tree — expected to overfit relative to the forest
# (its AUC below is the weakest of the five models).
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
prediction_dt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test, y_test)
print("Decision Tree accuracy is:", accuracy_dt)
print(classification_report(y_test, prediction_dt_y))
Decision Tree accuracy is: 0.7317535545023697
precision recall f1-score support
0 0.83 0.80 0.81 1549
1 0.50 0.55 0.52 561
accuracy 0.73 2110
macro avg 0.66 0.68 0.67 2110
weighted avg 0.74 0.73 0.74 2110
In [128]:
show_confusion_matrix(y_test, prediction_dt_y, "decision tree")
show_ROC(dt_model, y_test, "decision tree")
Compare models¶
In [129]:
# All fitted models, keyed by display name, for the comparison below.
models = {
    'Logistic Regression': lr_model,
    'KNN': knn_model,
    'Random Forest': rt_model,
    'Decision Tree': dt_model,
    'SVM': svc_model
}
- Best Overall Performance: Logistic Regression achieves the highest Accuracy (0.8114) and F1-Score (0.6188), giving the best overall balance for this dataset.
- Best Discriminative Power: Random Forest has the highest AUC-ROC (0.8589), although its accuracy is slightly below Logistic Regression's and its recall is the lowest of the five models.
- Best Precision: Random Forest also has the highest Precision (0.6938), nearly 70%.
- Best Recall: KNN has the highest Recall (0.5526), followed closely by Decision Tree (0.5544). These models are better at finding churners, but at the cost of low precision.
- Worst Performer: Decision Tree performs significantly worse than the others in almost every metric, especially AUC-ROC (0.6753). It may be overfitting to the training data.
In [130]:
# Score every model on the same held-out test set and tabulate the metrics.
results = []
for name, model in models.items():
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # P(churn), for AUC
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1,
        'AUC-ROC': auc
    })
# Create DataFrame once from the list of records (avoids row-by-row concat)
results_df = pd.DataFrame(results).round(4)
print(results_df)
Model Accuracy Precision Recall F1-Score AUC-ROC 0 Logistic Regression 0.8114 0.6687 0.5758 0.6188 0.8578 1 KNN 0.7768 0.5849 0.5526 0.5683 0.8183 2 Random Forest 0.8085 0.6938 0.5009 0.5818 0.8589 3 Decision Tree 0.7318 0.4960 0.5544 0.5236 0.6753 4 SVM 0.8090 0.6846 0.5223 0.5925 0.8059
- The purple bars (AUC-ROC) show Random Forest leading, followed by Logistic Regression and SVM.
- The blue bars (Accuracy) show Logistic Regression, SVM, and Random Forest clustered together at the top.
- The red bars (Recall) highlight KNN and Decision Tree as having relatively higher recall compared to their precision.
In [131]:
# (removed the redundant mid-notebook "import plotly.express as px";
# px is already imported in the setup cell at the top of the notebook)
# Melt to long format so px.bar can draw grouped bars per metric.
melted = results_df.melt(id_vars='Model',
                         value_vars=['Accuracy', 'Recall', 'F1-Score', 'AUC-ROC'],
                         var_name='Metric', value_name='Score')
fig = px.bar(melted, x='Model', y='Score', color='Metric', barmode='group',
             title='Model Performance Comparison',
             height=500, text_auto=True)
fig.update_layout(yaxis_range=[0, 1])
fig.show()
- All models perform significantly better than the random classifier.
- The Random Forest curve is consistently above the others, especially in the top-left region.
- The Logistic Regression curve is very close to Random Forest, similar performance.
- The Decision Tree curve is lower, especially in the middle range, poor AUC score.
In [132]:
# Overlay the ROC curves of all five models for a single visual comparison.
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc_score(y_test, y_pred_proba):.3f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(alpha=0.3)
plt.show()
- For a Balanced Approach: Logistic Regression — it offers the best overall accuracy and F1-score, and its coefficients can be interpreted to understand which features drive churn.
- For Maximizing Detection (High Recall): KNN or Decision Tree, though be aware they have lower precision.
- For Minimizing False Positives (High Precision): Random Forest.
- For Best Overall Ranking Ability (Highest AUC): Random Forest.